Load the Required Libraries


> library(easypackages)
> libraries("dplyr", "reshape2", "readxl", "ggpubr","stringr", "ggplot2", 
+           "tidyverse","lme4", "data.table", "readr","plotly", "DT",
+           "pheatmap","asreml", "VennDiagram", "patchwork", "heatmaply", 
+           "ggcorrplot", "RColorBrewer", "hrbrthemes", "tm", "proustr", "arm")
Online License checked out Wed Mar 24 21:16:18 2021

1 General Information on Demo Data


Study: Rainfed Rice Trials

Experimental Design: Augmented Randomized Block Design;

  • 4 blocks

  • 1 Replication, 322 entries (un-replicated) and 12 checks (replicated).

Season: Wet-season (WS).

Location: Two NARES Locations in India.

Year: 2019.

Contact Person: Rain-fed Breeding Team


NOTE: Due to IRRI’s data policies, the actual names of lines and complete metadata information is not given in this demo report.


2 Description of Demo Data Set


  • Demo data set used in this analysis pipeline was evaluated in augmented RCBD experimental design.

  • The demo data includes data from two environments phenotyped in India for three traits grain yield, plant height and days to flowering.

  • Besides blocks, information on columns and rows is also given in the data set.

> # Remove previous work
>   rm(list=ls())
> # Upload the demo data set
>   demo.data<-read.csv(file="~/Documents/GitHub/Analysis-pipeline/Data/demo.data.csv", 
+                     header = TRUE)
> # Convert variables into appropriate data types
>   demo.data$Genotype<-as.factor(demo.data$Genotype) # Genotypes as factor
>   demo.data$Block<-as.factor(demo.data$Block) # Block as factor
>   demo.data$Row<-as.factor(demo.data$Row) # Row as factor
>   demo.data$Column<-as.factor(demo.data$Column) # Column as factor
> # View as data table
>   print_table <- function(table, ...){
+   datatable(table, extensions = 'Buttons',
+   options = list(scrollX = TRUE, 
+   dom = '<<t>Bp>',
+   buttons = c('copy', 'excel', 'pdf', 'print')), ...)
+   }
>   print_table(demo.data[, c(1, 5,6,8,9,10, 11, 12)], editable = 'cell', 
+ rownames = FALSE, caption = htmltools::tags$caption("Table: Raw data for grain yield (kg/ha), days to flowering and plant height in two environmnets.",style="color:black; font-size:130%"), filter = 'top')

3 Pre-processing of Data: Checking Quality of Phenotypic Data


  • Here in this step data will be pre-processed and quality of data will be checked, and only quality phenotypes will be advanced for downstream analysis to have more reliable and accurate estimates or predictors.

  • The steps in pre-processing involves:

    • Looking for missing data
    • Descriptive statistics for each variable
    • Heat-maps of field experimental design
    • Data visualisations as box-plots, histograms and QQ plots
    • Look and filter for outliers.

3.1 Visualize the Missing Data.


  • Here we will check whether the data has any missing values
> # Missing data count across all columns
>   demo.data[demo.data==0]<-NA # Converting any values with Zero into NA
>   na_count <-data.frame(missing.count=sapply(demo.data, function(y) sum(length(which(is.na(y))))))
> #  colSums(is.na(demo.data)) # alternative
>   na_count$Variables<-row.names(na_count)
> # Visualize missing data as bar plot
>   ggbarplot(na_count, x = "Variables", y = "missing.count",
+           fill="lightblue",
+           color = "lightblue", # Set bar border colors to white
+           x.text.angle = 45 # Rotate vertically x axis texts
+           )+
+     labs(title="Missing Data Points for all Variables",x="Variables", y = "Count")+
+     theme (plot.title = element_text(color="black", size=12,hjust=0.5, face="bold"), # add and modify the title to plot
+                  axis.title.x = element_text(color="black", size=12), # add and modify title to x axis
+                  axis.title.y = element_text(color="black", size=12))

> # Let us see which one is missing for Plant Height
>   #demo.data$Height[which(is.na(demo.data$Height))]
> # let us see the details on this
>  # demo.data[216, ]

Note: No missing data observed for each varaiable.


3.2 Descriptive Statistics


  • Here data is summarized as mean, median, minimum and maximum values, standard deviation (SD), coefficient of variation (CV) and standard error (SE) for grain yield (kg/ha), days to flowering, and plant height (cm).
> # Summary for grain yield
>   summary.Yield<-data.frame(demo.data %>% 
+   group_by(Environment)%>% 
+   summarize(Mean = mean(Yield, na.rm=TRUE),
+         Median= median(Yield, na.rm=TRUE),
+         SD =sd(Yield, na.rm=TRUE),
+         Min.=min(Yield, na.rm=TRUE),
+         Max.=max(Yield, na.rm=TRUE),
+         CV=sd(Yield, na.rm=TRUE)/mean(Yield, na.rm=TRUE)*100,
+         St.err= sd(Yield, na.rm=TRUE)/sqrt(length(Yield))
+         ))
>   summary.Yield<-data.frame(lapply(summary.Yield, function(y) if(is.numeric(y)) round(y, 2) else y)) 
> 
>   summary.Yield<-cbind(data.frame(Trait=c(rep("Yield", nrow(summary.Yield)))),summary.Yield )
> # Summary for flowering date
>   summary.flowering<-data.frame(demo.data %>% 
+   group_by(Environment)%>% 
+   summarize(Mean = mean(Days.to.flowering, na.rm=TRUE),
+         Median= median(Days.to.flowering, na.rm=TRUE),
+         SD =sd(Days.to.flowering, na.rm=TRUE),
+         Min.=min(Days.to.flowering, na.rm=TRUE),
+         Max.=max(Days.to.flowering, na.rm=TRUE),
+         CV=sd(Days.to.flowering, na.rm=TRUE)/mean(Days.to.flowering, na.rm=TRUE)*100,
+         St.err= sd(Days.to.flowering, na.rm=TRUE)/sqrt(length(Days.to.flowering))
+         ))
>   summary.flowering<-data.frame(lapply(summary.flowering, function(y) if(is.numeric(y)) round(y, 2) else y)) 
>   summary.flowering<-cbind(data.frame(Trait=c(rep("Flowering", nrow(summary.flowering)))),summary.flowering )
> # Summary for plant height
>   summary.height<-data.frame(demo.data %>% 
+   group_by(Environment)%>% 
+   summarize(Mean = mean(Height, na.rm=TRUE),
+         Median= median(Height, na.rm=TRUE),
+         SD =sd(Height, na.rm=TRUE),
+         Min.=min(Height, na.rm=TRUE),
+         Max.=max(Height, na.rm=TRUE),
+         CV=sd(Height, na.rm=TRUE)/mean(Height, na.rm=TRUE)*100,
+         St.err= sd(Height, na.rm=TRUE)/sqrt(length(Height))
+         ))
>   summary.height<-cbind(data.frame(Trait=c(rep("Height", nrow(summary.height)))),summary.height )
> # Now combine the all data summeries and view as table
>   summary.data<-rbind(summary.Yield, summary.flowering, summary.height)
>   summary.data<-data.frame(lapply(summary.data, function(y) if(is.numeric(y)) round(y, 2) else y)) 
> # Add options to print and export
>   print_table(summary.data, rownames = FALSE,caption = htmltools::tags$caption("Data summary including mean, median, standard deviation (SD), coefficient of variation (CV), and standard error (St.err) for yield (kg/ha), days to flowering and plant height (cm).", style="color:black; font-size:130%"))

Note: High CV for grain yield is observed as compared to flowering and plant height. Across the two environments CV is differes slightly



3.3 Heat Maps of the Field Experimental Design


  • Experimental design in the field for grain yield is visualized through heat map to get better idea about the field design and spatial variations in the field.

3.3.1 Showing heat map of field design under environment 1

> 
> # Under environment 1  
>   demo.data.env1<- subset(demo.data, Environment=="Env1",
+                 select =c("Block", "Column", "Yield") )
>   demo.data.env1<-data.frame(demo.data.env1%>% group_by(Block)%>% arrange(Block) %>%arrange(Column))
>   demo.data.env1<-droplevels.data.frame(demo.data.env1)
>   demo.data.env1<-reshape(demo.data.env1, idvar = "Block", 
+                 timevar = "Column", direction = "wide")
>   row.names(demo.data.env1)<-paste0("Block",  demo.data.env1$Block)
>   demo.data.env1<-data.matrix(demo.data.env1[,-1])
>   colnames(demo.data.env1) <- gsub(x = colnames(demo.data.env1),
+                                  pattern = "Yield.", replacement = "") 
> 
>   plot.gy.env1<-heatmaply(demo.data.env1, main = "Grain yield under environment 1",
+                 xlab = "Columns",
+                 ylab = "Rows",
+                 Rowv=FALSE,
+                 Colv = FALSE, cexRow = 0.8, cexCol = 0.6, na.value="white")
> plot.gy.env1

Note: Extreme blue shows low grain yield values and yellow are very high grain yield values. X axis shows the list of columns and y-axis the blocks (blocks here are also treated as rows).

3.3.2 Showing heat map of field design under environment 2

> # For Environment 2
>   demo.data.env2<- subset(demo.data, Environment=="Env2",
+                   select =c("Block", "Column", "Yield") )
>   demo.data.env2<-data.frame(demo.data.env2%>% group_by(Block)%>% arrange(Block) %>%arrange(Column))
>   demo.data.env2<-droplevels.data.frame(demo.data.env2)
>   demo.data.env2<-reshape(demo.data.env2, idvar = "Block", 
+                 timevar = "Column", direction = "wide")
>   row.names(demo.data.env2)<-paste0("Block",demo.data.env2$Block)
>   demo.data.env2<-data.matrix(demo.data.env2[,-1])
>   colnames(demo.data.env2) <- gsub(x = colnames(demo.data.env2), pattern = "Yield.", replacement = "") 
> plot.gy.env2<-heatmaply(demo.data.env2, main = "Grain yield under environment 2",
+                 xlab = "Columns",
+                 ylab = "Rows",
+                 Rowv=FALSE,
+                 Colv = FALSE, cexRow = 0.8, cexCol = 0.6, na.value="white")
> plot.gy.env2

Note: Extreme blue shows low grain yield values and yellow are very high grain yield values. X axis shows the list of columns and y-axis the blocks (blocks here are also treated as rows). Note down the high single grain yield value in Block 3.


3.4 Data Visualization


  • Here in this section we will visualize the data using Box plot, Histograms, and QQ plot.

3.4.1 Box plot

> # First let us visualize the data using boxplots
>   myboxplot<- function(dataframe,x,y){
+    aaa <- enquo(x)
+    bbb <- enquo(y)
+    dfname <- enquo(dataframe)
+    dataframe %>%
+    filter(!is.na(!! aaa), !is.na(!! bbb))  %>%
+       #group_by(!! aaa,!! bbb) %>%
+       #count() %>%
+     ggplot(aes_(fill=aaa, x=aaa, y=bbb))+ 
+     theme_classic()+
+     geom_boxplot()+
+      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +# fill by timepoint to give different color
+       #scale_fill_manual(values = c("", ""))+
+       #scale_color_manual(values = c("", ""))
+       theme (plot.title = element_text(color="black", size=12,hjust=0.5, face = "bold"), # add and modify the title to plot
+     axis.title.x = element_text(color="black", size=12, face = "bold"), # add and modify title to x axis
+     axis.title.y = element_text(color="black", size=12, face="bold")) + # add and modify title to y axis
+   #scale_y_continuous(limits=c(0,15000), breaks=seq(0,15000,1000), expand = c(0, 0))+
+     theme(axis.text= element_text(color = "black", size = 10))+ # modify the axis text
+     theme(legend.title = element_text(colour="black", size=16), legend.position = "none",
+                   legend.text = element_text(colour="black", size=14))+ # add and modify the legends
+                   guides(fill=guide_legend(title="Environments"))+
+   stat_summary(fun.y=mean, geom="line", aes(group=1))  + 
+   stat_summary(fun=mean, geom="point")
+   }
> 
> # Now draw the box plot for yield
>   p1<-boxplot.yield<-myboxplot(demo.data,x=Environment,y=Yield)+
+   labs(title="",x="Environments", y = "Grain Yield (kg/ha)")+
+   stat_compare_means(method = "anova", label.x = 1.6, label.y = 10000)
>   #p1<-ggplotly(boxplot.yield)
> 
> # Now draw the box plot for flowering
>   p2<-boxplot.flowering<-myboxplot(demo.data,x=Environment,y=Days.to.flowering)+
+   labs(title="",x="Environments", y = "Days to flowering")+
+   stat_compare_means(method = "anova", label.x = 1.6, label.y = 130)
> #p2<-ggplotly(boxplot.flowering)
> 
> # Now draw the box plot height
>   p3<-boxplot.height<-myboxplot(demo.data,x=Environment,y=Height)+
+   labs(title="",x="Environments", y = "Plant Height (cm)")+
+   stat_compare_means(method = "anova", label.x = 1.6, label.y = 167)
> #p3<-ggplotly(boxplot.height)
> #p1+p2+p3
>   par(mfrow=c(1,3))
>   p1<-ggplotly(p1)
>   p2<-ggplotly(p2)
>   p3<-ggplotly(p3)
>   subplot(p1, p2, p3, nrows=1, margin = 0.05, titleY = TRUE)

Note: Non-significant difference between two environments for yield and plant height, p-value is provided on top of each plot. Outliers are present as black solid dots for all traits.

Histograms and QQ plots are also available , click the buttons below

3.4.2 Histogram plots

  • Histograms for all traits to check distribution of data.
>   par(mfrow=c(1,2))
> # For grain yield
>   envi<-unique(demo.data$Environment)
>   for(i in 1:length(envi)){
+   level_envi <- demo.data[which(demo.data$Environment==envi[i]),]
+   hist(level_envi$Yield, col = "pink", xlab="Grain yield (kg/ha)",
+   main=paste(envi[i]))
+   
+ }
Showing histograms for Grain yield, flowering and plant height.

Showing histograms for Grain yield, flowering and plant height.

> # For Flowering date
>   envi<-unique(demo.data$Environment)
>   for(i in 1:length(envi)){
+   level_envi <- demo.data[which(demo.data$Environment==envi[i]),]
+   hist(level_envi$Days.to.flowering, col = "pink", xlab="Days to flowering",
+   main=paste(envi[i]))
+   
+ }
Showing histograms for Grain yield, flowering and plant height.

Showing histograms for Grain yield, flowering and plant height.

> # For Plant height
>   envi<-unique(demo.data$Environment)
>   for(i in 1:length(envi)){
+   level_envi <- demo.data[which(demo.data$Environment==envi[i]),]
+   hist(level_envi$Height, col = "pink", xlab="Plant Height (cm)",
+   main=paste(envi[i]))
+   
+ }
Showing histograms for Grain yield, flowering and plant height.

Showing histograms for Grain yield, flowering and plant height.

Showing histograms for grain yield, flowering and plant height under two environments.

3.4.3 QQ plots

  • QQ plots are drawn to check the normality of the data. It is just to to see if our data assumptions are plausible.
  • We expect line to be straight, if it deviates then it indicates some issues with data. More information on QQ plots can be found here on this link
> ## QQ plots to check normality assumption
> # For the grain Yield
>   par(mfrow=c(1,2))
>   envi<-unique(demo.data$Environment)
>   for(i in 1:length(envi)){
+   level_envi <- demo.data[which(demo.data$Envi==envi[i]),]
+   qqnorm(level_envi$Yield, pch = 1, frame = TRUE,  main=paste(envi[i],".Yield"))
+   qqline(level_envi$Yield, col = "steelblue", lwd = 2)
+   }

> # For the days to flowering
>   par(mfrow=c(1,2))
>   envi<-unique(demo.data$Environment)
>   for(i in 1:length(envi)){
+   level_envi <- demo.data[which(demo.data$Envi==envi[i]),]
+   qqnorm(level_envi$Days.to.flowering, pch = 1, frame = TRUE,  main=paste(envi[i],".Flowering"))
+   qqline(level_envi$Days.to.flowering, col = "steelblue", lwd = 2)
+   }

> # For plant height
>   par(mfrow=c(1,2))
>   envi<-unique(demo.data$Environment)
>   for(i in 1:length(envi)){
+   level_envi <- demo.data[which(demo.data$Envi==envi[i]),]
+   qqnorm(level_envi$Height, pch = 1, frame = TRUE,  main=paste(envi[i],".Height"))
+   qqline(level_envi$Height, col = "steelblue", lwd = 2)
+   }

Grain yield looks good, check the deviation of observed values for plant height and flowering date.


3.5 Identify and Remove Outliers


Note: Outliers may drastically change the estimates, ranking (BLUPs or BLUEs) and predictions!! Further reading Resource 1; Resource 2; Resource 3

  • As the data is in Augmented RCBD with one replication, uni-variate approach will be used to find the potential outliers.
  • Briefly, we will flag out the lines that lie outside 1.5*IQR, where IQR, the ‘Inter Quartile Range’ is the difference between 75th and 25th quartiles, these are observation that are outside the whiskers in box plot. For more details on this please check this R blog.
  • So, in this section, a function named outlier.box which will first subset the data based on environment and replication, then we will get Quantiles, IQR and maximum and minimum values. Then we will flag out and extract the ones that fall outside the maximum and minimum value in the data.
  • More robust approach by integrating all the data across all the trials or locations to find potential outliers by the method called Bonferroni–Holm test is briefly described in this article. The R codes on this is available at Click the link.
  • Here we will be using Uni-variate approach as we are dealing with un-replicated data sets.
> # Univariate approach to falg out outliers in augmented un-replicated design
>   outlier.box<- function(data, trait, name){ 
+   #test<-subset(data, Envi==envir )# subsset based on environment and replications
+   #test<-droplevels.data.frame(test) # drop factor levels
+   #var_name <- eval(substitute(var),eval(data))
+   trait_name<- eval(substitute(trait),eval(data)) # evaluate trait name
+   Q3 = quantile(trait_name, 0.75, na.rm = TRUE) # get Q3
+   Q1=quantile(trait_name, 0.25, na.rm = TRUE)
+   IQR=IQR(trait_name, na.rm = TRUE)
+   Maxi<-Q3+1.5*IQR # Maximum Value
+   Mini<-Q1-1.5*IQR # Minimum Value
+   #out_flag_max<-ifelse(trait_name >Maxi , "OUTLIER_Max", ".") # Flag lines with maximum value as OUTLIER_Max
+   #out_flag_min <-ifelse(trait_name < Mini , "OUTLIER_Min", ".") 
+   out_flag<-ifelse(trait_name >Maxi | trait_name < Mini , name, ".") # Flag the outliers
+   #out<-cbind(out_flag_max,out_flag_min)
+   out_data<-cbind(data, out_flag) # Combine the original data
+   #outliers<- data[which(out_data$out_flag_max!="." |out_data$out_flag_min!="." ), c(1, 2,4,7,15)] # Extract the ones with extreame values and return only selected columns
+   #outliers<- data[which(out_data$out_flag!="."),] # Extract the ones with extreme values and return only selected columns
+   return( out_data)
+   }
>   table(demo.data$Environment)

Env1 Env2 
 380  380 
> # Now subset the data and use above function to identify the outliers
> # Subset two environment 1
>   Env1<-subset(demo.data, Environment=="Env1") # drought data
>   Env1<-droplevels.data.frame(Env1) # drop factor levels
> # Now subset the environment 2
>   Env2<-subset(demo.data, Environment=="Env2")
>   Env2<-droplevels.data.frame(Env2) # drop factor levels
> # Now identify the outliers for grain yield
>   Env1<-outlier.box(Env1,name="Outlier.GY", trait = Yield) # returns the list that has outliers for drought environment
>   Env2<-outlier.box(Env2,name="Outlier.GY", trait = Yield) # returns the list that has outliers for non-stress environment
> # Now identify the outliers for plant height
>   Env1<-outlier.box(Env1,name="Outlier.PH", trait = Height) # returns the list that has outliers for drought environment
>   Env2<-outlier.box(Env2,name="Outlier.PH", trait = Height) # returns the list that has outliers for non-stress environment
> # Now identify the outliers for days to flowering
>   Env1<-outlier.box(Env1,name="Outlier.FL", trait = Days.to.flowering) # returns the list that has outliers for drought environment
>   Env2<-outlier.box(Env2,name="Outlier.FL", trait = Days.to.flowering) # returns the list that has outliers for non-stress environment
> # Now merge all the files and save them
>   demo.data.out<-rbind(Env1, Env2)
> #Here we will inspect all the outliers and filter the extreme ones.
> #First let us change the names of last two columns
>   colnames(demo.data.out)[c(13,14,15)] <- c("out.flag.GY", "out.flag.PH", "out.flag.FL")
> # Visualize as table
>   print_table(demo.data.out[, c(1, 4, 10,11,12,13,14,15)], editable = 'cell', rownames = FALSE, caption = htmltools::tags$caption("Table: Showing the list of outliers for grain yield, plant height and flowering date in two environments.",style="color:black; font-size:130%"), filter='top')

3.6 Filter outliers

  • In this section we will convert the outliers into the missing values for all the three traits and save the file for downstream analysis.
  • Users can also convert them into mean values. See the codes given below for more details.
> # For grain yield
>   demo.data.out$Yield<- ifelse(demo.data.out$out.flag.GY==".", demo.data.out$Yield, NA)
> # For plant height
>   demo.data.out$Height<- ifelse(demo.data.out$out.flag.PH==".", demo.data.out$Height, NA)
> # For plant height
>   demo.data.out$Days.to.flowering<- ifelse(demo.data.out$out.flag.FL==".", demo.data.out$Days.to.flowering, NA)
> # We can also conver the outliers into mean values 
>     #data<-data.frame(matrix())
>     #env<- unique(TEST$Envi)
>     #for(i in 1:length(env)){
>     #data1<-TEST[which(TEST$Envi==env[i]),]
>     #data1$Yield <- ifelse(data1$out.all==".", data1$Yield, mean(data1$Yield))
>     #return(data1)
>     #data2<-rbind(data1, data)
>      #}

Box Plots after Removing Outliers

> # Now draw the box plot
>   p1<-boxplot.yield<-myboxplot(demo.data.out,x=Environment,y=Yield)+
+   labs(title="",x="Environments", y = "Grain Yield (kg/ha)")+
+   stat_compare_means(method = "anova", label.x = 1.6, label.y = 10000)
>   #p1<-ggplotly(boxplot.yield)
> # Now draw the box plot for flowering
>   p2<-boxplot.flowering<-myboxplot(demo.data.out,x=Environment,y=Days.to.flowering)+
+   labs(title="",x="Environments", y = "Days to flowering")+
+   stat_compare_means(method = "anova", label.x = 1.6, label.y = 130)
>   #p2<-ggplotly(boxplot.flowering)
> # Now draw the box plot height
>   p3<-boxplot.height<-myboxplot(demo.data.out,x=Environment,y=Height)+
+   labs(title="",x="Environments", y = "Plant Height (cm)")+
+   stat_compare_means(method = "anova", label.x = 1.6, label.y = 167)
> #p3<-ggplotly(boxplot.height)
>   par(mfrow=c(1,3))
>   p1<-ggplotly(p1)
>   p2<-ggplotly(p2)
>   p3<-ggplotly(p3)
>   subplot(p1, p2, p3, nrows=1, margin = 0.05, titleY = TRUE)

Box plot showing distribution for all traits.

Note: Seems much better now.

Descriptive Statistics after Removing Outliers

>   summary.Yield<-data.frame(demo.data.out %>% 
+   group_by(Environment)%>% 
+   summarize(Mean = mean(Yield, na.rm=TRUE),
+         Median= median(Yield, na.rm=TRUE),
+         SD =sd(Yield, na.rm=TRUE),
+         Min.=min(Yield, na.rm=TRUE),
+         Max.=max(Yield, na.rm=TRUE),
+         CV=sd(Yield, na.rm=TRUE)/mean(Yield, na.rm=TRUE)*100,
+         St.err= sd(Yield, na.rm=TRUE)/sqrt(length(Yield))
+         ))
>   summary.Yield<-data.frame(lapply(summary.Yield, function(y) if(is.numeric(y)) round(y, 2) else y)) 
> 
>   summary.Yield<-cbind(data.frame(Trait=c(rep("Yield", nrow(summary.Yield)))),summary.Yield )
> # Summary for the flowering data
>   summary.flowering<-data.frame(demo.data.out %>% 
+   group_by(Environment)%>% 
+   summarize(Mean = mean(Days.to.flowering, na.rm=TRUE),
+         Median= median(Days.to.flowering, na.rm=TRUE),
+         SD =sd(Days.to.flowering, na.rm=TRUE),
+         Min.=min(Days.to.flowering, na.rm=TRUE),
+         Max.=max(Days.to.flowering, na.rm=TRUE),
+         CV=sd(Days.to.flowering, na.rm=TRUE)/mean(Days.to.flowering, na.rm=TRUE)*100,
+         St.err= sd(Days.to.flowering, na.rm=TRUE)/sqrt(length(Days.to.flowering))
+         ))
>   summary.flowering<-data.frame(lapply(summary.flowering, function(y) if(is.numeric(y)) round(y, 2) else y)) 
>   summary.flowering<-cbind(data.frame(Trait=c(rep("Flowering", nrow(summary.flowering)))),summary.flowering )
> # Summary for plant height
>   summary.height<-data.frame(demo.data.out %>% 
+   group_by(Environment)%>% 
+   summarize(Mean = mean(Height, na.rm=TRUE),
+         Median= median(Height, na.rm=TRUE),
+         SD =sd(Height, na.rm=TRUE),
+         Min.=min(Height, na.rm=TRUE),
+         Max.=max(Height, na.rm=TRUE),
+         CV=sd(Height, na.rm=TRUE)/mean(Height, na.rm=TRUE)*100,
+         St.err= sd(Height, na.rm=TRUE)/sqrt(length(Height))
+         ))
>   summary.height<-data.frame(lapply(summary.height, function(y) if(is.numeric(y)) round(y, 2) else y)) 
>   summary.height<-cbind(data.frame(Trait=c(rep("Height", nrow(summary.height)))),summary.height )
> # Now combine the all data summeries and view as table
>   summary.data<-rbind(summary.Yield, summary.flowering, summary.height)
>   datatable(summary.data,options = list(pageLength = 7, dom = 'tip'), rownames = FALSE,caption = htmltools::tags$caption("Data summary after removing outliers.", style="color:black; font-size:130%"))
> # Save the file for analysis > write.csv(demo.data.out, file = "~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/demo.data.filtered.csv", row.names = FALSE)

4 Data Analysis for Grain Yield in ASReml R Package


  • In this section, data analysis will be shown only for grain yield trait using a Linear Mixed-Model Approach in ASReml-R package.

  • Demo data analysis for grain yield will also be shown using freely available lme4 R Package package, and will be useful to the users which do not have access to the commercial ASReml-R package. See the section 5 below.

  • In general analysis is divided in two parts: First Separate analysis or step-wise analysis: In this two environments will be analyzed separately. We will be testing five mixed models correcting for experimental design factors (Blocks here) and spatial trends in the field. Then best model will be selected (model having lowest AIC value ) and used to extract the BLUPs or Breeding Values and Heritability. Second Combined analysis or Multi-environment trial (MET) analysis: In this analysis two environments will combined and analyzed together and single value BLUPs for each genotype will be extracted. The model used for combined analysis is again Mixed-Model accounting for spatial trends. We already know best spatial model (found in separate analysis above) in two environments, so this information will be used and incorporated in combined mixed-model analysis model.

  • Then we will rank the genotypes based on the BLUP values and compare it with checks.

  • We will also look at correlations between the environments.


4.1 Separate Analysis


Model 1

  • In this model we account for just experimental design factor Block and no spatial effects.

  • Note we used block as fixed effect in most cases due to less than 5 degrees of freedom. If you are interested to know whether to use block fixed or random in model I highly recommend this Blocks Fixed or Random?

  • Also Note row and block is same in all the trials. So it does not matter whether we use row or block in model.

  • Best linear unbiased predictors (BLUPs) extracted here is equivalent to breeding values


\[ Y_{ij}= \mu+G_{i} + B_{j} + \epsilon_{ij}\\ Y_{ij}= \text{ is the effect of $i$th genotype in $j$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{k}= \text {fixed effect of $k$th block}\\ \varepsilon_{ij}=\text{residual error}\\ \text{here we assume errors are independent and identically distributed }\epsilon\sim \text{$iid$N}(0,\sigma_\epsilon^2)\\ \]

R script in Asreml

model1<-asreml(fixed=trait~Block, random=~Genotype, na.method=“include”, data=data)

  • In the above model block is treated as fixed effect and genotype as random effect

Model 2

  • In this model we account for just experimental design factor block, and column no spatial effects.

\[ Y_{ijk}= \mu+G_{i} + B_{j}+ C_{k} + \epsilon_{ijk}\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th block and $k$th column} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{j}= \text {random efect of $j$th block}\\ C_{k}= \text {random efect of $k$th column}\\ \varepsilon_{ijk}=\text{residual error}\\ \text{here we assume residuals are independent and identically distributed }\epsilon\sim \text{$iid$N}(0,\sigma_\epsilon^2)\\ \]

R script in Asreml

model2<-asreml(fixed=trait~1, random=~Column+Block+Genotype, na.method=“include”, data=data)

  • Block, column and genotype all are used as random effects.

Model 3

  • In this model we account block, and column effects, and for spatial trends/effects i.e, correlated residuals both across rows and columns.

\[ Y_{ijk}= G_{i} + B_{j}+ C_{k} + \epsilon_{ijk}\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th block and $k$th column} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{j}= \text {random efect of $j$th block}\\ C_{k}= \text {random efect of $k$th column}\\ \epsilon_{ijk}=\text {residual error}\\ \]

here, we assume \(\epsilon\) is a random effect that represents correlated residuals based on the distance between plots along both the rows and columns, where, \(\epsilon\sim N(0,\mathbf{R})\) and R is the covariance matrix of \(\epsilon\). The difference between this model and model 1 and model 2 described above is the structure of the covariance residuals \(R ={\sigma_\epsilon^2\ \Sigma}_c\left(\rho_c\right)\otimes\Sigma_r\left(\rho_r\right)\). \(\sigma_\epsilon^2\) is the variance of spatially dependent residual; \({\Sigma}_c\left(\rho_c\right)\) and \(\ \Sigma_r\left(\rho_r\right)\) represents the first-order autoregressive correlation matrices and \(\rho_{c\ }\) and \(\rho_{r\ }\) are the autocorrelation parameters for the columns and rows; \(\otimes\) represents the Kronecker product between separable auto-regressive processes of the first order in the row-column dimensions. For more details on this, these references would be helpful Gilmour et al., 1997; Gogel et al., 2018; Andrade et al., 2020; Bernardeli et al.202

R script in Asreml

model3<-asreml(fixed=trait~1, random=~Column+Block+Genotype,residual =~ar1v(Block):ar1(Column), na.method=“include”, data=data)

  • Block, column and genotype all are used as random effects.

Model 4

  • In this model we account only for fixed block effect and spatial trends i.e, correlated residuals both across blocks or rows and columns in row and column directions both.

\[ Y_{ijk}= G_{i} + B_{j} + \epsilon\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{k}= \text {fixed efect of $j$th block}\\ \epsilon_{ij}=\text {residual error}\\ \] here, we assume \(\epsilon\) is a random effect that represents correlated residuals based on the distance between plots along both the rows and columns, where, \(\epsilon\sim N(0,\mathbf{R})\) and R is the covariance matrix of \(\epsilon\). Here, \(R ={\sigma_\epsilon^2\ \Sigma}_c\left(\rho_c\right)\otimes\Sigma_r\left(\rho_r\right)\). \(\sigma_\epsilon^2\) is the variance of spatially dependent residual; \({\Sigma}_c\left(\rho_c\right)\) and \(\ \Sigma_r\left(\rho_r\right)\) represents the first-order autoregressive correlation matrices and \(\rho_{c\ }\) and \(\rho_{r\ }\) are the autocorrelation parameters for the columns and rows; \(\otimes\) represents the Kronecker product between separable auto-regressive processes of the first order in the row-column dimensions.

R script in Asreml

model4<-asreml(fixed=trait~Block, random=~Genotype,residual =~ar1v(Block):ar1(Column), na.method=“include”, data=data)

  • Block is fixed and genotype as random effects.

Model 5

  • In this model we account for spatial trends i.e, correlated residuals across columns only.

\[ Y_{ijk}= G_{i} + B_{j}+ C_{k} + \epsilon\\ Y_{ijk}= \text{ is the effect of $i$th genotype and $j$th block in $k$th column} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{k}= \text {fixed efect of $k$th block}\\ C_{k}= \text {efect of $k$th column}\\ \epsilon_{ijk}=\text {residual error}\\ \]

here, we assume \(\epsilon\) is a random effect that represents correlated residual across columns only, where, \(\epsilon\sim N(0,\mathbf{R})\) and R is the covariance matrix of \(\epsilon\), and \(\mathbf{R}={\sigma_\epsilon^2\ \Sigma}_c\left(\rho_c\right)\otimes I_r\). \(\sigma_\epsilon^2\) is the variance of spatially dependent residual; \({\Sigma}_c\left(\rho_c\right)\) represents the first-order autoregressive correlation matrices and \(\rho_{c\ }\) the autocorrelation parameters for the columns only; \(I_r\) represents independently and identically distributed variance structure for rows.

R script in Asreml

model5<-asreml(fixed=trait~Block, random=~Genotype,residual =~idv(Block):ar1(Column), na.method=“include”, data=data))

  • Block as fixed effect and genotype as random effects.

4.1.1 Best Model for Grain Yield

  • Here we will first built a function to run all the five models.
  • These five models will be run separately on environment 1 and environment 2 data.
  • Best model will be selected based on AIC and residual plot information, and later used to extract BLUPs in two environments.
  • Please click on the code button on right-side to see the model with lowest AIC value.

Read data filtered for outliers and built function for running models


Best Model for Grain Yield Under Environment 1

> # Now run above function to test various models for both environments and traits
> # For grain yield under environment 1
>   output.env1.gy<-my.asreml(demo.env1, trait = "Yield")
Online License checked out Wed Mar 24 21:16:30 2021
Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:30 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2756.464      743920.6    376 21:16:30    0.0
 2     -2748.409      609062.8    376 21:16:30    0.0
 3     -2737.023      384902.4    376 21:16:30    0.0
 4     -2722.847      144448.7    376 21:16:30    0.0
 5     -2718.047       66552.3    376 21:16:30    0.0
 6     -2718.015       62713.2    376 21:16:30    0.0
 7     -2718.015       62655.8    376 21:16:30    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:32 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2776.060      690216.5    379 21:16:32    0.0 (1 restrained)
 2     -2765.985      573570.7    379 21:16:32    0.0
 3     -2753.990      362051.8    379 21:16:32    0.0
 4     -2739.469      131220.2    379 21:16:32    0.0
 5     -2732.670       55797.3    379 21:16:32    0.0 (2 restrained)
 6     -2732.319       59030.0    379 21:16:32    0.0 (2 restrained)
 7     -2732.310       60102.0    379 21:16:32    0.0 (2 restrained)
 8     -2732.309       60081.9    379 21:16:32    0.0 (2 restrained)
 9     -2732.309       60081.9    379 21:16:32    0.0 (1 restrained)
10     -2732.309       60081.9    379 21:16:32    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:33 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2777.951      713886.2    379 21:16:33    0.0 (1 restrained)
 2     -2766.215      584008.2    379 21:16:33    0.0
 3     -2753.507      366319.0    379 21:16:33    0.0 (1 restrained)
 4     -2741.336      168486.5    379 21:16:33    0.0 (2 restrained)
 5     -2732.910       73123.0    379 21:16:33    0.0 (2 restrained)
 6     -2730.524       59868.9    379 21:16:33    0.0 (2 restrained)
 7     -2730.215       62109.7    379 21:16:33    0.0 (1 restrained)
 8     -2730.201       63635.3    379 21:16:33    0.0 (1 restrained)
 9     -2730.171       64413.5    379 21:16:33    0.0
10     -2730.169       64416.5    379 21:16:33    0.0 (1 restrained)
11     -2730.160       64769.9    379 21:16:33    0.0
12     -2730.159       64759.5    379 21:16:33    0.0 (1 restrained)
13     -2730.156       64908.0    379 21:16:33    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:34 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2756.839      757812.6    376 21:16:34    0.0
 2     -2748.188      620990.1    376 21:16:34    0.0
 3     -2736.723      400946.6    376 21:16:34    0.0
 4     -2723.405      165883.1    376 21:16:34    0.0 (1 restrained)
 5     -2717.533       86647.5    376 21:16:34    0.0 (1 restrained)
 6     -2715.805       64432.4    376 21:16:34    0.0
 7     -2715.725       69904.3    376 21:16:34    0.0
 8     -2715.607       69894.4    376 21:16:34    0.0
 9     -2715.596       71087.8    376 21:16:34    0.0
10     -2715.589       71117.9    376 21:16:34    0.0
11     -2715.587       71269.3    376 21:16:34    0.0
12     -2715.586       71242.1    376 21:16:34    0.0
13     -2715.586       71281.5    376 21:16:34    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:36 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2755.285      746909.5    376 21:16:36    0.0
 2     -2747.278      613923.8    376 21:16:36    0.0
 3     -2736.080      393579.7    376 21:16:36    0.0
 4     -2722.492      157334.6    376 21:16:36    0.0
 5     -2717.491       75123.5    376 21:16:36    0.0
 6     -2717.335       66304.9    376 21:16:36    0.0
 7     -2717.333       65792.7    376 21:16:36    0.0

> # Extract the name of model that has lower AIC
>   best.model.env1.gy<-colnames(output.env1.gy)[apply(output.env1.gy,1,which.min)]
>   best.model.env1.gy
[1] "model1"

Click on code icon on right side to see which model is best


Best Model for Grain Yield Under Environment 2

> # For grain yield under environment 2
>   output.env2.gy<-my.asreml(demo.env2, trait = "Yield")
Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:40 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2728.248      691772.4    374 21:16:40    0.0
 2     -2722.699      574840.4    374 21:16:40    0.0
 3     -2713.184      357862.3    374 21:16:40    0.0
 4     -2697.903      114819.9    374 21:16:40    0.0
 5     -2693.571       54154.3    374 21:16:40    0.0
 6     -2693.563       52509.2    374 21:16:40    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:41 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2748.855      645634.8    377 21:16:41    0.0
 2     -2741.746      546605.1    377 21:16:41    0.0
 3     -2731.373      344533.1    377 21:16:41    0.0
 4     -2715.475      103539.8    377 21:16:41    0.0
 5     -2709.982       40203.5    377 21:16:41    0.0
 6     -2709.815       38285.0    377 21:16:41    0.0
 7     -2709.810       38502.0    377 21:16:41    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:42 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2747.192      655268.4    377 21:16:42    0.0
 2     -2738.579      543567.0    377 21:16:42    0.0
 3     -2726.797      339750.4    377 21:16:42    0.0 (1 restrained)
 4     -2710.762      138196.2    377 21:16:42    0.0 (1 restrained)
 5     -2708.409       75708.1    377 21:16:42    0.0 (1 restrained)
 6     -2706.031       49119.0    377 21:16:42    0.0 (1 restrained)
 7     -2705.050       66933.3    377 21:16:42    0.0 (1 restrained)
 8     -2704.969       70833.9    377 21:16:42    0.0 (1 restrained)
 9     -2704.957       72307.1    377 21:16:42    0.0
10     -2704.956       72709.5    377 21:16:42    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:43 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2726.128      695306.5    374 21:16:43    0.0
 2     -2719.381      574462.1    374 21:16:43    0.0
 3     -2708.377      357139.5    374 21:16:43    0.0 (1 restrained)
 4     -2692.387      140077.6    374 21:16:43    0.0 (1 restrained)
 5     -2690.407       76752.3    374 21:16:43    0.0
 6     -2688.296       54569.5    374 21:16:43    0.0
 7     -2687.674       70417.1    374 21:16:43    0.0
 8     -2687.632       74106.1    374 21:16:43    0.0
 9     -2687.626       75581.3    374 21:16:43    0.0
10     -2687.626       75939.8    374 21:16:43    0.0

Model fitted using the gamma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:44 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -2725.104      687241.7    374 21:16:44    0.0
 2     -2718.935      571044.5    374 21:16:44    0.0
 3     -2708.342      356549.1    374 21:16:44    0.0 (1 restrained)
 4     -2692.660      141531.9    374 21:16:44    0.0 (1 restrained)
 5     -2690.215       78139.6    374 21:16:44    0.0
 6     -2688.383       60226.1    374 21:16:44    0.0
 7     -2688.064       67226.9    374 21:16:44    0.0
 8     -2687.877       77539.6    374 21:16:44    0.0
 9     -2687.868       79880.3    374 21:16:44    0.0
10     -2687.867       80421.8    374 21:16:44    0.0

> # Extract the name of model that has lower AIC
>   best.model.env2.gy<-colnames(output.env2.gy)[apply(output.env2.gy,1,which.min)]
>   best.model.env2.gy 
[1] "model5"

Click on code icon on right side to see which model is best


<>

4.1.2 Extract BLUPs and Heritabilities

  • Here in this section we will select and run the best model and extract BLUPs (know more on BLUPs or BLUEs here).

  • Best model will be selected based on lower AIC values and also residual plot. Lower the AIC value better is the model. For example for grain yield under environment 1 Model 1 has lower AIC values, however, we will pick Model 4 as best model because its AIC values is very close to Model 1 but residual plot lookes much better than Model 1**

  • We will also calculate the heritability’s. Note we are dealing with trials that is un-replicated and has missing data, so we cannot use basic formula as: \(h{^2}= \frac{\sigma^{2}g}{\sigma^{2}g+\sigma^{2}e}\) to calculate heritability. Plus when we are dealing with spatial models or complex models, calculating heritability with this method is not appropriate.

  • Alternative method as described by Piepho and M€ohring (2007) is more appropriate for complex residual structures and unbalanced experimental designs. The equation is: \(H_{C}=1-\frac{\overline{V}_{BLUP}}{2\sigma^{2}g}\). Where \(\overline{V}_{BLUP}\) is mean variance difference of difference of two BLUP and \(\sigma^{2}g\) is variance of genotypes. Note this definition of heritability is related to reliability of breeding value predictions. For more details please check the Book: Genetic Data Analysis for Plant and Animal Breeding; Chapter 7 and this beautiful resource Summary of heritability equations

  • So in this section a developed function called my.blup which will be used to extract BLUPs and then heritability will be calculated by method described above.

> # Now select the best model to extract BLUPs for each trait and environment
> # First we will build again a function to extract BLUPs and heritability from best model
>   my.blup<-function(model, data){
+   #p<-plot(varioGram(model))
+   # Now use predict function to return the list of three containing predicted values, and average S.E differnces
+   predicted.values<-predict(model, "Genotype", sed=T)
+   # Extract the BLUPs from above
+   blups<-predicted.values$pvals
+   # Now let us add the line designation names
+   # BLUPs with line names 
+   #blups<-merge(data[,c(7,8,13,14)],blups, by="Genotype")
+   #blups<-blups[!duplicated(blups$Genotype), ]
+   # Calculate the heritability
+   # Simply based on the variance components
+   #heritability<-vpredict(model5, hA ~  V1/(V1 + V2+V3+V4+V5))
+   #H2<-heritability[1,1]*100
+   #the Reliazied heritability that is appropriate for complex residual structures and unbalanced experimental designs introduced by Cullis et al. (2006) and discussed by Piepho and M€ohring (2007):
+   # page 235
+   # First let us extract the vBLUp difference
+   avgsd<-predicted.values$avsed[2]
+   h2<- (1-((predicted.values$avsed[2])^2/((summary(model)$varcomp[1,1])*2)))*100
+   return(list(Heritability=h2, BLUPs=blups))
+   }
> # Now for grain yield under environment 1
> # Fitting Model 4 as its best.
>   model4.gy.env1<-asreml(fixed=Yield~Block, random=~Genotype,
+                  residual =~ar1v(Block):ar1(Column),  na.method="include", data=demo.env1)
> # BLUPs and heritability for grain yield under Environment 1
>   out.gy.env1<-my.blup(model4.gy.env1, demo.env1)
>   out.gy.env1$Heritability
>   blups.env1.gy<-out.gy.env1$BLUPs
>   names(blups.env1.gy)[c(2,3)]<-c("blups.gy", "std.er.gy")
>   
> # Now for grain yield under environment 2
>   model5.gy.env2<-asreml(fixed=Yield~Block, random=~Genotype,
+                residual =~idv(Block):ar1(Column),  na.method="include", data=demo.env2)
> # BLUPs and heritability for grain yield under drought
>   out.gy.env2<-my.blup(model5.gy.env2, demo.env2)
>   out.gy.env2$Heritability
>   blups.env2.gy<-out.gy.env2$BLUPs
> # rename the columns and select appropriate columns
>   names(blups.env2.gy)[c(2,3)]<-c("blups.gy", "std.er.gy")
> # Now let us combine all the BLUPs dataframes into one and save
>   
> # Let us add environment information column to the extracted BLUEs data file
>   blups.env1<-data.frame(cbind(data.frame(Environment=c(rep("Env1",nrow(blups.env1.gy)))), blups.env1.gy))
> # Now add line.type information
>   blups.env1<-merge(demo.env1[,c(4,9)],blups.env1, by="Genotype")
>   blups.env1<-blups.env1[!duplicated(blups.env1$Genotype), ]
> # Now combine environment 2 information
>   blups.env2<-data.frame(cbind(data.frame(Environment=c(rep("Env2",nrow(blups.env2.gy)))), blups.env2.gy))
> # Now add the genotype names name and line.type
>   blups.env2<-merge(demo.env2[,c(4,9)],blups.env2, by="Genotype")
>   blups.env2<-blups.env2[!duplicated(blups.env2$Genotype), ]
> # Now combine all
>   blups.all<-rbind(blups.env1[,-6], blups.env2[,-6])
> # Round all the columns containing blups and standard errors
>   blups.all<-data.frame(lapply(blups.all, function(y) if(is.numeric(y)) round(y, 2) else y)) 
> # Save the blups in the directory
>   write.csv(blups.all, 
+           file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/blups.all.seperate.csv",
+           row.names = FALSE)

Summary and Heritability for Grain Yield

> # Calcualate summary and heritability
> # Save heritability as vector
>   Heritability<-c(out.gy.env1$Heritability, out.gy.env2$Heritability)
> # 
>   summary.gy<-cbind(data.frame(blups.all%>% 
+         group_by(Environment)%>% 
+   summarize(Mean = mean(blups.gy, na.rm=TRUE),
+         Median= median(blups.gy, na.rm=TRUE),
+         SD =sd(blups.gy, na.rm=TRUE),
+         Min.=min(blups.gy, na.rm=TRUE),
+         Max.=max(blups.gy, na.rm=TRUE))
+         ),Heritability) 
> # Round 
>   summary.gy<-data.frame(lapply(summary.gy, function(y) if(is.numeric(y)) round(y, 2) else y)) 
> # Plot the data.tables
>   print_table(summary.gy, rownames = FALSE,caption = htmltools::tags$caption("Data summaries of BLUPs and heritability for grain yield (kg/ha) in two environments.", style="color:black; font-size:130%"))

Note: Heritability under two environments is very close.

BLUPs for Grain Yield

> # BLUPs table
>   print_table(blups.all[, c(1,2,3,4)],editable = 'cell', rownames = FALSE,caption = htmltools::tags$caption("BLUPs along with standard errors for grain yield in Environment 1 (Env1) and Environment (Env2).", style="color:black; font-size:130%"), filter = 'top')

4.2 Multi-environment trial (MET)/Combined Analysis

  • Here in this section MET analysis for two environments will be performed and single value BLUP for each genotype will be predicted. We will also calculate combined heritability.

  • With the separate analysis done above we know which best spatial model works in each trial. We will directly borrow this information and incorporate into our combined model analysis, so we do not need to test various models.

*Click button below for more model description:

Combined or MET model


\[ Y_{ijk}= \mu+G_{i} + E_{j}+G_{i}*E_{j}+ B_{k}(E_{j}) + \epsilon_{ijk}\\ Y_{ijk}= \text{ is the effect of $i$th genotype in $j$th environment and $k$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ E_{j}=\text{fixed effect of the $j$th environment}\\ B_{k}= \text {random efect of $k$th block nested in $j$th environment}\\ \epsilon_{ijk}=\text{residual error}\\ \]


here, we assume \(\epsilon\) is a random effect that represents correlated residual across columns and or rows depending upon the correlation structure that was fitted. Further it should be noted we applied a separate spatial trend for each environment in the combined model, and spatial model was decided from the seperate analysis done above. See the Asreml code below:

R script in Asreml for grain yield

met.gy<-asreml(Yield ~Environment,random= ~Genotype +Environment:Genotype+Block:Environment,residual = dsum(ar1v(Block):ar1(Column)+ ~idv(Block):ar1(Column)|Environment,levels = list(c(1), c(2))), na.method =“include”, data = crurrs.data.out)

  • Genotype,block were treated as random effects and environments as fixed effect .

> # First we will read the filtered data set containing data from two environments. 
>   if(exists('demo.data.out') && is.data.frame(get('demo.data.out'))){
+     demo.data.out=demo.data.out
+   }else{
+     demo.data.out<-read.csv(file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/demo.data.filtered.csv",
+                             header = TRUE) 
+   }
> # In case checks are used as fixed effects  
> # Create two new columns if design is augmented. 
> # Adding a new column 'new' that will help treat genotypes as separate
> #demo.data.out$Genotype<-as.numeric(demo.data.out$Genotype)
> #demo.data.out<- within(demo.data.out,{
> #new <- ifelse(demo.data.out$Line.type=="check", 0, 1)
>   #})
>   # Adding a new column 'Genotypec' that will help us group all the new entries
>   # in a single pool, yet treat all checks as separate
>   #demo.data.out<- within(demo.data.out, {
>    # Genotypec <- ifelse(demo.data.out$new > 0, 999, demo.data.out$Genotype)
>   #})
> # Arrange the the data set before running it
>   demo.data.out<-data.frame(demo.data.out%>% group_by(Environment)%>%arrange(Row, Column))
>   demo.data.out<-demo.data.out%>% arrange(Environment)
>   columns<-c("Plot", "Genotype", "Replication", "Block", "Row", "Column", "Year")
>   demo.data.out[, columns]<-lapply(columns, function(x) as.factor( demo.data.out[[x]]))
>   demo.data.out$Yield<-as.numeric(demo.data.out$Yield)
>   demo.data.out$Environment<-as.factor(demo.data.out$Environment)

Click on code on right-side to see detailed models and how heritability and BLUPs are extracted

> # Here we will perform combined analysis of two environmenys
> # Spatial variation model will be used, model will be selected based on previous analysis done seperately
> # For GRAIN YIELD
>     met.gy<-asreml(Yield ~1,random= ~Genotype +Environment:Genotype+Block:Environment,
+           residual =~dsum(~ar1v(Block):ar1(Column)+idv(Block):ar1(Column)|Environment,levels = list(c(1), c(2))), na.method ="include", data = demo.data.out)
Multi-section model using the sigma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:48 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -5984.741           1.0    757 21:16:49    0.1
 2     -5770.608           1.0    757 21:16:49    0.1
 3     -5542.038           1.0    757 21:16:49    0.1 (1 restrained)
 4     -5414.267           1.0    757 21:16:49    0.1
 5     -5332.976           1.0    757 21:16:49    0.1
 6     -5298.165           1.0    757 21:16:49    0.1
 7     -5294.841           1.0    757 21:16:49    0.1
 8     -5293.464           1.0    757 21:16:49    0.1
 9     -5293.219           1.0    757 21:16:49    0.1
10     -5293.204           1.0    757 21:16:49    0.1
11     -5293.200           1.0    757 21:16:49    0.1
12     -5293.199           1.0    757 21:16:49    0.1
13     -5293.199           1.0    757 21:16:50    0.1
> 
>   #aic<- -2*(model.met2$loglik-length(model.met2$vparameters));aic
>   predicted.gy<-predict(met.gy, "Genotype", sed=T)
Multi-section model using the sigma parameterization.
ASReml 4.1.0 Wed Mar 24 21:16:50 2021
          LogLik        Sigma2     DF     wall    cpu
 1     -5293.199           1.0    757 21:16:50    0.1
 2     -5293.199           1.0    757 21:16:50    0.1
 3     -5293.199           1.0    757 21:16:50    0.2
> # Extract the BLUPs from above
>   blups.gy.met<-predicted.gy$pvals
>   names(blups.gy.met)[c(2,3)]<-c("blups.gy", "std.er.gy")
> # Now calculate heritability
>   h2.gy.met<- (1-((predicted.gy$avsed[2])^2/((summary(met.gy)$varcomp[2,1])*2)))*100;h2.gy.met
    mean 
87.10002 
> # Now add designation and line.type to blup file
>  # Now add the genotype name and line.type
>   blups.met<-merge(demo.env1[,c(4,9)],blups.gy.met[,-4], by="Genotype")
>   blups.met<-blups.met[!duplicated(blups.met$Genotype), ] 
>   blups.met<-data.frame(lapply(blups.met, function(y) if(is.numeric(y)) round(y, 2) else y))

BLUPs for grain yield from combined analysis

> # BLUPs table
>   print_table(blups.met[, c(1,2, 3,4)],editable = 'cell', rownames = FALSE,caption = htmltools::tags$caption(" BLUPs along with standard errors for grain yield (kg/ha) from MET analysis", style="color:black; font-size:130%"))
> # Save the blup file
>   write.csv(blups.met, 
+           file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/blups.combined.csv",
+           row.names = FALSE)

Combined Data Summary and Heritability

>   summary.met.gy<-data.frame(blups.met%>% 
+         group_by(Line.type)%>% 
+   summarize(Mean = mean(blups.gy, na.rm=TRUE),
+         Median= median(blups.gy, na.rm=TRUE),
+         SD =sd(blups.gy, na.rm=TRUE),
+         Min.=min(blups.gy, na.rm=TRUE),
+         Max.=max(blups.gy, na.rm=TRUE),
+         Heritability=h2.gy.met)
+           )
>   summary.met.gy<-data.frame(lapply(summary.met.gy, function(y) if(is.numeric(y)) round(y, 2) else y))
>   summary.met.gy[1,7]<-"-"
>   print_table(summary.met.gy, rownames = FALSE)

Note: Heritability of grain yield in combined analysis is low as compared to seperate analysis. Also, check the mean and higher value of genotype as compared to checks.


5 Phenotypic Data Analysis in lme4 R Package


  • Here in this section phenotypic data analysis is performed in an open source R package called lme4. More on this R package can be found here lme4 Tutorial 1, and lme4 Tutorial 2.

  • The purpose of this section is to repeat the phenotypic data analysis in lme4 as ASReml R package is commercial package and may not available for all the users.

  • Filtered data set will be used, same one used in ASReml R package to perform the analysis in lme4.

  • ANOVA, variance components, BLUPS, BLUES and heritability is extracted for the results part.

5.1 Upload the Filtered Phenotypic Data

> # Read the saved csv file, if working directly 
>   if(exists('demo.data.out') && is.data.frame(get('demo.data.out'))){
+   demo.data.out=demo.data.out
+   }else{
+   demo.data.out<-read.csv(file="~/Documents/GitHub/Analysis-pipeline/Outputs/Tables/demo.data.filtered.csv",
+                             header = TRUE)
+ # factor conversion if below are not in factors 
+   columns<-c("Plot", "Genotype", "Replication", "Block", "Row", "Column", "Year")
+   demo.data.out[, columns]<-lapply(columns, function(x) as.factor(demo.data.out[[x]]))
+   demo.data.out$Yield<-as.numeric(demo.data.out$Yield)
+   demo.data.out$Height<-as.numeric(demo.data.out$Height)
+   }
> # Now we will subset the environments and traits for analysis
>   demo.data.out<-data.frame(demo.data.out%>% group_by(Environment)%>%arrange(Row, Column)) # arrange by row and column
>   demo.data.out<-data.frame(demo.data.out%>% arrange(Environment)) # Arrange by environment
>   demo.env1<-subset(demo.data.out, Environment=="Env1") # Environment 1
>   demo.env1<-droplevels.data.frame(demo.env1)
>   demo.env2<-subset(demo.data.out, Environment=="Env2") # Environment 2
>   demo.env2<-droplevels.data.frame(demo.env2)

5.2 Seperate Analayis

Model 1.lme4


\[ Y_{ij}= \mu+G_{i} + B_{j} + \varepsilon_{ij}\\ Y_{ij}= \text{ is the effect of $i$th genotype in $j$th block}\\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ B_{j}= \text {fixed effect of $j$th block}\\ e_{ij}=\text{error}\\ \text{here we assume residuals are independent and identically distributed }\varepsilon\sim \text{$iid$N}(0,\sigma_e^2)\\ \]


  • The model described above is equivalent to model1 described in ASReml R package analysis.
> # Now apply model
>   model1.lme4<-lmer(Yield~Block+(1|Genotype), data =demo.env1)

5.2.1 Results

  • Here we will summarize the results using summary() function. The first few lines of output indicate that the model was fitted by REML as well as the value of the REML criterion. The second piece of the summary output provides information regarding the random-effects and residual variation. The third piece of the summary output provides information regarding the fixed-effects and the fourth piece of summary output provides information regarding the correlation of fixed effects.
> # Summarise the results
>   summary(model1.lme4)
Linear mixed model fit by REML ['lmerMod']
Formula: Yield ~ Block + (1 | Genotype)
   Data: demo.env1

REML criterion at convergence: 6127.1

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.77102 -0.20285  0.00171  0.19963  1.82542 

Random effects:
 Groups   Name        Variance Std.Dev.
 Genotype (Intercept) 747281   864.5   
 Residual              62655   250.3   
Number of obs: 380, groups:  Genotype, 344

Fixed effects:
            Estimate Std. Error t value
(Intercept)  4061.23      70.00  58.015
Block2       -100.16      82.48  -1.214
Block3         14.68      82.48   0.178
Block4         21.22      82.48   0.257

Correlation of Fixed Effects:
       (Intr) Block2 Block3
Block2 -0.589              
Block3 -0.589  0.500       
Block4 -0.589  0.500  0.500

5.2.1.1 Extract variance components

>   Ve<- VarCorr(model1.lme4)
>   Ve
 Groups   Name        Std.Dev.
 Genotype (Intercept) 864.45  
 Residual             250.31  

5.2.1.2 Plot the residual vs fitted plot (check for homoscedasticicty)

>   plot(fitted(model1.lme4), resid(model1.lme4), type="pearson")
>   abline(0,0, col="blue")

> # Plot QQ plot
>   qqnorm(resid(model1.lme4))

> # Residual plot
>   plot(residuals(model1.lme4,type="pearson"), main='Model residuals', 
+   ylab='Pearson residual value')

5.2.1.3 ANOVA

> # ANOVA
>   anova(model1.lme4)
Analysis of Variance Table
      npar Sum Sq Mean Sq F value
Block    3 178059   59353  0.9473

5.2.1.4 Extract the Fixed effects

>   BLUEs<-fixef(model1.lme4)
>   BLUEs
(Intercept)      Block2      Block3      Block4 
 4061.23001  -100.16316    14.67836    21.22442 

5.2.1.5 Extract the Random effects

> # Extract the Random effects
>   BLUPs<-data.frame(Blups.yield=ranef(model1.lme4)$Genotype)
>   GV<-data.frame(BLUps.GY=coef(model1.lme4)$Genotype[,1]) #Genotype values (Blups +Intercept)

5.2.2 Heritability

> # Extract the variance components
>   Ve<- data.frame (VarCorr(model1.lme4))
>   Ve
       grp        var1 var2      vcov    sdcor
1 Genotype (Intercept) <NA> 747281.11 864.4542
2 Residual        <NA> <NA>  62655.26 250.3103
> # Now calculate heritability using variance components
>   genotype.var=Ve[1,4]
>   error.var=Ve[2,4]
> # Now heritability
>   h2=genotype.var/(genotype.var+error.var)*100
>   h2
[1] 92.26417
> # Reliability
>   std.err<-se.ranef(model1.lme4)$Genotype
>   v_BLUP<- mean(std.err)
> # Heritability/Reliability 
>   h2<- (1-((v_BLUP)^2/(Ve[1,4]*2)))*100
>   h2
[1] 96.26181

5.3 Stage Wise/Combined Analysis


Model 2.lme4

  • Here we will analyze the two environments and extract the single BLUPs for each genotype. We will use mixed model analysis in lme4 r package model. We will treat genotypes as random.

5.3.1 Combined ANOVA

>   demo.data.out$Environment<-as.factor(demo.data.out$Environment)
>   model.anova<-lm(formula = Yield~Genotype+Environment+Genotype:Environment+Environment/Block,data=demo.data.out)
> # Get ANOVA
>   anova(model.anova)
Analysis of Variance Table

Response: Yield
                      Df    Sum Sq Mean Sq F value    Pr(>F)    
Genotype             343 554389253 1616295 28.2725 < 2.2e-16 ***
Environment            1   1049221 1049221 18.3532 6.092e-05 ***
Genotype:Environment 341  67400900  197657  3.4574 1.023e-08 ***
Environment:Block      6    505160   84193  1.4727    0.2012    
Residuals             66   3773115   57168                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.3.2 Check for Homogeneity of Variance

>   var.test(Yield~Environment,data=demo.data.out)

    F test to compare two variances

data:  Yield by Environment
F = 1.0719, num df = 379, denom df = 377, p-value = 0.4998
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.875926 1.311767
sample estimates:
ratio of variances 
          1.071949 

5.3.3 Combined Analysis in lme4

  • The model we will use is give below:

\[ Y_{ij}= \mu + G_{i} + E_{j} + G_{i}*E_{j}+ B_{k}(E_{j})+ \varepsilon_{ijk}\\ Y_{ij}= \text{ is the effect of $i$th genotype in $j$th environment and $k$th block} \\ \mu= \text {overall mean}\\ G_{i}=\text{random effect of the $i$th genotype}\\ E_{j}= \text {fixed effect of $j$th environment}\\ G_{i}*G_{j}= \text {interaction effect of $i$th genotype in $j$th environment}\\ B_{k}(E_{j})= \text {effect of $k$th block nested with $j$th environment}\\ e_{ijk}=\text{error}\\ \text{here we assume residuals are independent and identically distributed }\varepsilon\sim \text{$iid$N}(0,\sigma_e^2)\\ \]


  • Mixed models are powerful tools to handle assumptions of linear model Read this one

  • We will extract variance components and also calculate heritability.

> demo.data.out$Environment<-as.factor(demo.data.out$Environment)
> Model2.lme4<-lmer(Yield ~Environment/Block+(1|Genotype)+(1|Environment)+(1|Genotype:Environment), data=demo.data.out)
> summary(Model2.lme4)
Linear mixed model fit by REML ['lmerMod']
Formula: Yield ~ Environment/Block + (1 | Genotype) + (1 | Environment) +  
    (1 | Genotype:Environment)
   Data: demo.data.out

REML criterion at convergence: 11898.4

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.00119 -0.26573 -0.01489  0.28043  1.80033 

Random effects:
 Groups               Name        Variance Std.Dev.
 Genotype:Environment (Intercept) 125837   354.7   
 Genotype             (Intercept) 619296   787.0   
 Environment          (Intercept)  63883   252.8   
 Residual                          56732   238.2   
Number of obs: 758, groups:  
Genotype:Environment, 686; Genotype, 344; Environment, 2

Fixed effects:
                       Estimate Std. Error t value
(Intercept)            4041.767    260.412  15.521
EnvironmentEnv2        -147.060    363.246  -0.405
EnvironmentEnv1:Block2  -33.050     65.447  -0.505
EnvironmentEnv2:Block2   60.946     65.541   0.930
EnvironmentEnv1:Block3    3.333     65.448   0.051
EnvironmentEnv2:Block3  115.206     65.613   1.756
EnvironmentEnv1:Block4   48.956     65.445   0.748
EnvironmentEnv2:Block4  234.575     65.559   3.578

Correlation of Fixed Effects:
            (Intr) EnvrE2 EE1:B2 EE2:B2 EE1:B3 EE2:B3 EE1:B4
EnvrnmntEn2 -0.697                                          
EnvrnmE1:B2 -0.126  0.089                                   
EnvrnmE2:B2 -0.004 -0.088  0.008                            
EnvrnmE1:B3 -0.126  0.085  0.499  0.041                     
EnvrnmE2:B3 -0.004 -0.088  0.018  0.501  0.028              
EnvrnmE1:B4 -0.126  0.088  0.499  0.015  0.499  0.015       
EnvrnmE2:B4 -0.003 -0.088  0.003  0.503  0.043  0.501  0.008
optimizer (nloptwrap) convergence code: 0 (OK)
unable to evaluate scaled gradient
Model failed to converge: degenerate  Hessian with 1 negative eigenvalues
> plot(Model2.lme4)

> # Extract the variance components
> Ve<- data.frame (VarCorr(Model2.lme4))
> Ve
                   grp        var1 var2      vcov    sdcor
1 Genotype:Environment (Intercept) <NA> 125837.07 354.7352
2             Genotype (Intercept) <NA> 619296.17 786.9537
3          Environment (Intercept) <NA>  63883.47 252.7518
4             Residual        <NA> <NA>  56732.25 238.1853
> # Heritability
> std.err<-se.ranef(Model2.lme4)$Genotype
> v_BLUP<- mean(std.err)
> # Heritability/Reliability 
> h2<- (1-((v_BLUP)^2/(Ve[2,4]*2)))*100
> h2
[1] 93.48238

6 Ranking of Genotypes and Correlations

  • Here in this section we will rank the genotypes based on the BLUPs extracted from the combined analysis in ASReml R package and select top 10% of genotypes and plot it as bar plot. Same thing can be done with BLUPs extracted from the lme4 R package.

  • We will also compare the rankings of genotypes based on BLUPs obtained in environment 1, environment 2 and combined analysis. We will see which genotypes are common in top 10% of lines all the three. We will save it in data.frame and also plot Venn Diagram.

  • We will also check the correlations between BLUPs in Env1, Env2 and combined one.

6.1 Ranking of BLUPs

  • Ranking of genotypes based the BLUPs extracted in Combined analysis
> # Ranking and selection of top performing lines
> # Subset only entries
>   blups.met.Genotype<-subset(blups.met, Line.type=="entry")
> # Get mean of entries and checks
>   Genotype.mean<-mean(blups.met.Genotype$blups.gy)
>   check.mean<-mean((subset(blups.met, Line.type=="check"))$blups.gy)
> # Arrange the BLUPs in decreasing order
>   blups.met.Genotype<-blups.met.Genotype%>%arrange(desc(blups.gy))
> # Select top 35 and merge with checks
>   blups.top25<-data.frame(rbind((blups.met.Genotype[1:35, ]), (subset(blups.met, Line.type=="check"))))
>   blups.top25<-droplevels.data.frame(blups.top25)
> # make factor unique to keep order of entries on plot
>   blups.top25$Genotype <- factor(blups.top25$Genotype, levels=unique(blups.top25$Genotype))
> # Draw the plot
>   bar.plot<-ggplot(data=blups.top25, aes(x=Genotype, y=blups.gy, fill=Line.type)) +
+   geom_bar(stat="identity", width=0.5)+
+   theme_classic()+
+     labs(title="BLUPs of Top Ranked Genotypes along with Checks",x="Genotypes", y = "BLUP value")+
+     #scale_y_continuous(limits = c(0, 6000), breaks = seq(0, 6000, by = 500))+
+     theme (plot.title = element_text(color="black", size=1, face="bold", hjust=0),
+            axis.title.x = element_text(color="black", size=10, face="bold"),
+            axis.title.y = element_text(color="black", size=10, face="bold")) +
+     theme(axis.text= element_text(color = "black", size = 8))+
+     geom_segment(aes(x = 1, y = Genotype.mean, xend = 35, yend =Genotype.mean), color="darkred", 
+                  linetype="dashed", size=1)+
+     geom_segment(aes(x = 36, y = check.mean, xend = 47, yend =check.mean), color="darkblue", 
+                  linetype="dashed", size=1)+
+     theme(axis.text.x = element_text(angle = 90, hjust = 1))
>     ggplotly(bar.plot)

Bar plot showing the BLUPs for top 10% of genotypes and all the checks. Dashed lines shows overall mean of all genotypes and checks. Genotypes differ slightly from checks and mean of entries and checks are almost close.

6.2 Correlation Between BLUPs

  • Here checking the correlation of BLUPs obtained in Environment 1, Environment 2 and Combined analysis
> # BLUPs in Environment 1
>     blups.env1<-subset(blups.all, Environment=="Env1", select =c(1,4))
>     colnames(blups.env1)<-c("Genotype", "BLUPs.Env1")
> # Blups in Environment 2
>     blups.env2<-subset(blups.all, Environment=="Env2", select =c(1,4))
>     colnames(blups.env2)<-c("Genotype", "BLUPs.Env2")
> # Now combined blups
>   blups.com<-blups.met[, c(1,2,3)]
>   colnames(blups.com)<-c("Genotype", "Line.type", "BLUPs.combined")
> # Merge all the BLUPs
>   blups.com.all<-merge((merge(blups.env1, blups.env2, by="Genotype")), (blups.com), by="Genotype")
>   str(blups.com.all)
'data.frame':   344 obs. of  5 variables:
 $ Genotype      : Factor w/ 344 levels "1","2","3","4",..: 1 10 100 101 102 103 104 105 106 107 ...
 $ BLUPs.Env1    : num  3303 3757 5260 2725 5273 ...
 $ BLUPs.Env2    : num  3010 3499 4018 3414 3977 ...
 $ Line.type     : chr  "entry" "entry" "entry" "entry" ...
 $ BLUPs.combined: num  3163 3559 4676 3047 4563 ...
>   corr.blup <- data.frame(round(cor(blups.com.all[,-c(1,4)]), 2))
>   print_table(corr.blup, rownames = TRUE, caption = htmltools::tags$caption("Correlation of BLUPs obtained in seperate analysis for environment 1, environment 2 and in combined analysis.", style="color:black; font-size:130%"))

6.3 Venn Diagram of Top Lines

  • Here we will look at the lines that are common in top 10% of lines in Environment 1, Environment 2 and combined analysis.
> # Combined blups
>   com.blups.top<-subset(blups.met, Line.type=="entry")
>   com.blups.top<-com.blups.top%>%arrange(desc(blups.gy))
>   com.blups.top<-com.blups.top[1:35,]
>   colnames(com.blups.top)[1]<-"Genotype.com"
> 
> # Blups in Environment 1
>   blups.env1<-subset(blups.all, Environment=="Env1", select =c(1,4))
>   blups.env1.top<-blups.env1%>%arrange(desc(blups.gy))
>   blups.env1.top<-blups.env1.top[1:35,]
>   colnames(blups.env1.top)[1]<-"Genotype.Env1"
> # Blups in Environment 2
>   blups.env2<-subset(blups.all, Environment=="Env2", select =c(1,4))
>   blups.env2.top<-blups.env2%>%arrange(desc(blups.gy))
>   blups.env2.top<-blups.env2.top[1:35,]
>   colnames(blups.env2.top)[1]<-"Genotype.Env2"
> # Now cbinb all the required columns
>   data.venn<-data.frame(cbind(Combined=com.blups.top$Genotype.com, Environment1=blups.env1.top$Genotype.Env1, Environment2=blups.env2.top$Genotype.Env2))
>   myCol <- brewer.pal(3, "Pastel2")
>   P<-venn.diagram(
+   x = list(data.venn$Combined, data.venn$Environment1, data.venn$Environment2),
+   category.names = c("Combined.BLUPs" , "Env1.BLUPs " , "Env2.BLUPs"),
+   filename = '~/Documents/GitHub/Analysis-pipeline/Codes/14_venn_diagramm.png',
+   output=TRUE,
+   # Output features
+   imagetype="png" ,
+   height = 1200 , 
+   width = 1200 , 
+   resolution = 500,
+   # Circles
+   lwd = 2,
+   lty = 'blank',
+   fill = myCol,
+   
+   # Numbers
+   cex = .8,
+   fontface = "bold",
+   fontfamily = "sans",
+   
+   # Set names
+   cat.cex = 0.4,
+   cat.fontface = "bold",
+   cat.default.pos = "outer",
+   cat.pos = c(-27, 27, 135),
+   cat.dist = c(0.055, 0.055, 0.085),
+   cat.fontfamily = "sans",
+   rotation = 1
+   
+ )
> P
[1] 1

Venn diagram showing list of lines that are common between separate analysis in environment 1, environment 2 and combined analysis. 17 top ranking genotypes were found common in environment 1, environment 2 and combined analysis and can be used for selection for broad adoptation

6.4 List of lines that are common in all the three

> overlap <- calculate.overlap(
+     x = list(data.venn$Combined, data.venn$Environment1, data.venn$Environment2)
+ )
> datatable(t(overlap$a5), rownames = TRUE, caption = htmltools::tags$caption("List of entries that are common between Environment 1, Environment 2, and in combined data analysis", style="color:black; font-size:130%"))

Table showing list of entries (genotype number is shown) that are common in all the three.

7 Final Remarks

  • Models accounting for spatial trends were more appropriate than models accounting just for experimental design factors.

  • Combined analysis performed would be more appropriate to rank lines, as done above.